SF3B1 iCLIP analysis

Differential binding

Author
Affiliation
Dr. Mirko Brueggemann

Buchmann Institute for Molecular Life Sciences

Published

September 19, 2023

1 Analysis description

2 Load libraries

Show code
library(rtracklayer)
library(GenomicRanges)
library(ggplot2)
library(AnnotationDbi)
library(dplyr)
library(reshape2)
library(UpSetR)
library(GenomicFeatures)
library(kableExtra)
library(knitr)
library(ggrepel)
library(gridExtra)
library(grid)
library(viridis)
library(BiocParallel)
library(DESeq2)
library(gplots)
library(ComplexHeatmap)
library(circlize)
library(GGally)
library(factoextra)
library(plotROC)
library(PRROC)
library(ggrastr)
library(ggbeeswarm)
library(ggpointdensity)
library(tidyr)
library(ggridges)
library(ggsci)
library(ggnewscale)
library(ggrastr)
library(Cairo)
library(ggforce)
library(patchwork)
library(dplyr)
library(tibble)
library(forcats)
library(stringr)
library(RColorBrewer)
library(ggpubr)
library(BindingSiteFinder)
library(Gviz)
library(ggpubr)
Show code
source("../styles.R")
source("../helper.R")

3 Differential binding

Here we test whether SF3B1 changes its affinity to defined binding sites in the mutant condition compared to the wild type (WT). To this end, we use the DESeq2 negative binomial (NB) model with a likelihood ratio test (LRT) to compare changes at binding sites to changes in the respective host gene. We account for changes in RNA abundance by approximating the transcript expression level with all iCLIP counts of a gene that do not fall into a binding site; we call these the background counts. The DESeq2 model uses the background counts to identify binding sites that change independently of the underlying transcript level, thus disentangling both signals.

Show code
# Load the filtered GENCODE v36 annotation (the .rda presumably provides the
# `anno` object used below) together with the matching TxDb
load("/Users/mirko/Projects/Annotations/human/gencode_36/filtered/gencode_v36_filtered.rda")
anno.db = loadDb("/Users/mirko/Projects/Annotations/human/gencode_36/filtered/gencode_v36_filtered.sqlite")

# Gene ranges from the TxDb, extended by the matching metadata rows of `anno`
gns = genes(anno.db)
annoRow = match(gns$gene_id, anno$gene_id)
elementMetadata(gns) = cbind(elementMetadata(gns), elementMetadata(anno)[annoRow,])

# Drop everything from the first "." onwards in the range names
# (presumably the Ensembl version suffix)
names(gns) = sub("\\..*", "", names(gns))

# Reduce the metadata to id, name and type, and keep the shortened range
# name as an additional `geneID` column
mcols(gns) = data.frame(
    gene_id = gns$gene_id,
    gene_name = gns$gene_name,
    gene_type = gns$gene_type)
gns$geneID = names(gns)
Show code
# Binding sites to test (the .rda provides `bsTranscript`)
load("/Users/mirko/Projects/sf3b1/02_markdowns/01_transcriptome/01_bindingSites/data/bsTranscript.rda")
bindingSites = bsTranscript

# Load clip data
clipFilesWt = "/Users/mirko/Projects/sf3b1/01_data_subsamp/wt/cov/replicate"
clipFilesMut = "/Users/mirko/Projects/sf3b1/01_data_subsamp/mut/cov/replicate"
clipFiles = c(clipFilesWt, clipFilesMut)
# the dot is escaped so that only files ending in a literal ".bw" are matched
clipFiles = list.files(clipFiles, pattern = "\\.bw$", full.names = TRUE)
clipFilesP = clipFiles[grep(clipFiles, pattern = "Plus")]
clipFilesM = clipFiles[grep(clipFiles, pattern = "Minus")]

# Assign the condition from the directory a file was found in, instead of
# relying on the order in which list.files() returns the files
isMutP = startsWith(clipFilesP, paste0(clipFilesMut, "/"))
isMutM = startsWith(clipFilesM, paste0(clipFilesMut, "/"))
# every plus-strand file needs a minus-strand partner of the same condition
stopifnot(
    length(clipFilesP) > 0,
    identical(isMutP, isMutM))

# Organize clip data in dataframe
colData = data.frame(
    id = seq_along(clipFilesP),
    condition = factor(ifelse(isMutP, "MUT", "WT"), levels = c("MUT", "WT")),
    clPlus = clipFilesP,
    clMinus = clipFilesM)
# Make BindingSiteFinder object
bds = BSFDataSetFromBigWig(ranges = bindingSites, meta = colData)
Show code
# Differential binding workflow in one pipeline:
#   1. binding site and background coverage per gene in `gns`
#   2. background filter
#   3. fold-changes
bds.diff = bds %>%
    calculateBsBackground(anno.genes = gns) %>%
    filterBsBackground() %>%
    calculateBsFoldChange()

3.1 Gene-wise pre-filtering

Show code
# Diagnostic plot for the "minCounts" criterion of the background filter
plotBsBackgroundFilter(bds.diff, filter = "minCounts")

minCount filter

Show code
# Diagnostic plot for the "balanceBackground" criterion of the background filter
plotBsBackgroundFilter(bds.diff, filter = "balanceBackground") 

balanceBackground filter

Show code
# Diagnostic plot for the "balanceCondition" criterion of the background filter
plotBsBackgroundFilter(bds.diff, filter = "balanceCondition")

balanceCondition filter

3.2 Binding site level results

Show code
# Number of binding sites per transcript region, split by significance of the
# binding-site level test (adjusted P value < 0.05). Missing adjusted P values
# are set to 1 and are therefore counted as not significant.
df = getRanges(bds.diff) %>%
    as.data.frame() %>%
    mutate(bs.padj = replace_na(bs.padj, 1)) %>%
    # the comparison already yields TRUE/FALSE, no ifelse() needed
    mutate(sig = bs.padj < 0.05) %>%
    group_by(sig, region) %>%
    # drop the grouping so that `df` is a plain, ungrouped table
    summarize(n = myFormat(n()), .groups = "drop")

kable(df, caption = "Result overview") %>% 
  kable_styling("striped") %>%
  scroll_box(width = "100%")
Result overview
sig region n
FALSE cds 3,541
FALSE intron 87,510
FALSE utr3 1,652
FALSE utr5 311
TRUE cds 2
TRUE intron 189
TRUE utr3 6
TRUE utr5 1
Show code
# MA plot with the default `what` setting (shown in the binding site level section)
plotBsMA(bds.diff)

MA plot

Show code
# Volcano plot with the default `what` setting (shown in the binding site level section)
plotBsVolcano(bds.diff)

Volcano plot

3.3 Gene level results

Show code
# Same overview as for the binding sites, but based on the background (gene
# level) adjusted P value `bg.padj`. Rows are still binding sites, so `n` is
# the number of binding sites per region and significance class, not the
# number of genes. Missing adjusted P values are set to 1 (not significant).
df = getRanges(bds.diff) %>%
    as.data.frame() %>%
    mutate(bg.padj = replace_na(bg.padj, 1)) %>%
    # the comparison already yields TRUE/FALSE, no ifelse() needed
    mutate(sig = bg.padj < 0.05) %>%
    group_by(sig, region) %>%
    # drop the grouping so that `df` is a plain, ungrouped table
    summarize(n = myFormat(n()), .groups = "drop")

kable(df, caption = "Result overview") %>% 
  kable_styling("striped") %>%
  scroll_box(width = "100%")
Result overview
sig region n
FALSE cds 1,960
FALSE intron 50,344
FALSE utr3 980
FALSE utr5 171
TRUE cds 1,583
TRUE intron 37,355
TRUE utr3 678
TRUE utr5 141
Show code
# MA plot for the background ("bg") results
plotBsMA(bds.diff, what = "bg")

MA plot

Show code
# Volcano plot for the background ("bg") results
plotBsVolcano(bds.diff, what = "bg")

Volcano plot

4 Integration with peak clustering

First, binding sites are overlapped with the classified peak regions from the clustering approach. This assigns binding sites to one of the four categories (DoubleWide, DoubleNarrow, SinglePeak, Rest). The DoubleWide peak class is further split into a left and a right side. Each side can overlap with multiple binding sites, resulting in multiple LFCs, P values, etc. for each side. To resolve this, the values from the binding site with the lowest P value were taken as representative.

Show code
# load peak classification from clustering
peakClass = rtracklayer::import.bed("../02_peakClassification/data/rngClassified.bed")
# the peak class is the part of the BED name before the first "_";
# vapply() guarantees exactly one character value per peak
peakClass$group = vapply(strsplit(peakClass$name, "_"), `[`, character(1), 1)

# group peaks by classification
peakList = split(peakClass, peakClass$group)
# bsRes = searchRes$obj
bsRes = getRanges(bds.diff)

# Collect ID, host gene, log2 fold-change and adjusted P value of all binding
# sites in `bs` that overlap `peaks`, labelled with `peakType`.
# Returns a data.frame with one row per overlapping binding site (zero rows if
# nothing overlaps).
# NOTE(review): the report text states that the binding site with the lowest
# P value is used as representative per side; this code keeps every
# overlapping binding site - confirm which behaviour is intended.
getPeakBsStats = function(bs, peaks, peakType) {
    olBs = subsetByOverlaps(bs, peaks)
    data.frame(
        BsID = olBs$bsID,
        GeneID = olBs$geneID,
        lfc = olBs$bs.log2FoldChange,
        padj = olBs$bs.padj,
        # rep() keeps the call valid when there is no overlap at all
        peakType = rep(peakType, length(olBs)))
}

# get binding site LFCs, P-values, etc. for binding sites in peak regions
df1 = getPeakBsStats(bsRes, peakList$DoubleNarrow, "DoubleNarrow")
df2 = getPeakBsStats(bsRes, peakList$DoubleWide, "DoubleWide")
df3 = getPeakBsStats(bsRes, peakList$Rest, "Rest")
df4 = getPeakBsStats(bsRes, peakList$SinglePeak, "SinglePeak")

# split double-wide peaks in left and right side
# -> based on midpoint
# -> left/ right switches with the strand
doublePeaks = peakList$DoubleWide
doublePeaks$doublePeakID = doublePeaks$name
strandedP = doublePeaks[strand(doublePeaks) == "+"]
strandedM = doublePeaks[strand(doublePeaks) == "-"]
if (length(strandedP) + length(strandedM) < length(doublePeaks)) {
    warning("DoubleWide peaks without strand information are not split", call. = FALSE)
}
# cut every peak into consecutive 41-nt windows
doublePeaksP = as(slidingWindows(x = strandedP, width = 41, step = 41), "GRangesList")
doublePeaksM = as(slidingWindows(x = strandedM, width = 41, step = 41), "GRangesList")

# part 1 is the first window on the plus strand and the second window on the
# minus strand (and vice versa for part 2); taking window 2 assumes that every
# DoubleWide peak is wider than 41 nt
doublePart1P = as(lapply(doublePeaksP, function(x){x[1]}),"GRangesList") %>% unlist()
doublePart2P = as(lapply(doublePeaksP, function(x){x[2]}),"GRangesList") %>% unlist()
doublePart1M = as(lapply(doublePeaksM, function(x){x[2]}),"GRangesList") %>% unlist()
doublePart2M = as(lapply(doublePeaksM, function(x){x[1]}),"GRangesList") %>% unlist()

# The parts are ordered plus strand first, then minus strand, so the peak IDs
# have to be concatenated in the same order (using doublePeaks$name directly
# would attach IDs in the original, strand-mixed order)
doublePeakIDs = c(strandedP$doublePeakID, strandedM$doublePeakID)

doublePart1 = c(doublePart1P, doublePart1M)
mcols(doublePart1)$doublePeakID = doublePeakIDs
export(doublePart1, con = "./data/LeftPartFar.bed", format = "BED")
doublePart2 = c(doublePart2P, doublePart2M)
mcols(doublePart2)$doublePeakID = doublePeakIDs
export(doublePart2, con = "./data/RightPartClose.bed", format = "BED")

df5 = getPeakBsStats(bsRes, doublePart1, "DoubleWide-Left")
df6 = getPeakBsStats(bsRes, doublePart2, "DoubleWide-Right")

4.1 Grouping by peak classification

4.1.1 All

Show code
# Combine the binding site statistics of all peak classes; the factor levels
# fix the order of the classes on the x axis
df = rbind(df1,df2,df3,df4,df5,df6)
df$peakType = factor(df$peakType, levels = c("DoubleWide", "DoubleNarrow", "SinglePeak", "Rest", "DoubleWide-Left", "DoubleWide-Right"))

# Violin plot with an overlaid boxplot of the binding site fold-changes per
# peak class; the dashed line marks a log2 fold-change of 0
p1 = ggplot(df, aes(x = peakType, y = lfc, fill = peakType)) +
  # TRUE instead of T: T is an ordinary variable that can be reassigned
  geom_violin(adjust = 1, trim = TRUE) +
  geom_boxplot(width = 0.5, fill = "white", outlier.size = 0.5) +
  theme_nice() +
  scale_fill_npg() +
  theme(legend.position = "none") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(x = "", y = "Fold-change (log2)") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

p1

Violin chart

Show code
# Boxplot (outliers hidden) of the binding site fold-changes per peak class.
# The y axis is zoomed to [-0.7, 0.7] via coord_cartesian(), and the two sides
# of the DoubleWide peaks are compared with a Wilcoxon test.
p2 = ggplot(df, aes(x = peakType, y = lfc, fill = peakType)) +
  geom_boxplot(outlier.colour = NA) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  stat_compare_means(
    comparisons = list(c("DoubleWide-Left", "DoubleWide-Right")),
    label = "p.signif",
    method = "wilcox.test",
    label.y = 0.25) +
  scale_fill_npg() +
  coord_cartesian(ylim = c(-0.7,0.7)) +
  labs(x = "", y = "Fold-change (log2)") +
  theme_nice() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

p2

Boxplot

Show code
# Number of binding sites per peak class, shown as a bar chart with the count
# printed above each bar
d = df %>%
  group_by(peakType) %>%
  summarise(n = n())

p3 = ggplot(d, aes(x = peakType, y = n, fill = peakType)) + 
  geom_col() +
  geom_text(aes(label = n), vjust=-0.3, size = 2) +
  scale_fill_npg() +
  labs(x = "", y = "N (BS)") +
  theme_nice() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

p3

Barchart

4.1.2 Cleaned

Show code
# Same data as before without the "Rest" class; the factor levels fix the
# order of the classes on the x axis
df = rbind(df1,df2,df3,df4,df5,df6) %>% filter(peakType != "Rest")
df$peakType = factor(df$peakType, levels = c("SinglePeak", "DoubleNarrow", "DoubleWide", "DoubleWide-Left", "DoubleWide-Right"))
# pairwise comparisons for the Wilcoxon tests
# NOTE(review): "DoubleWide" is built from the same peaks as
# "DoubleWide-Left"/"DoubleWide-Right", so these groups share binding sites
# and are not independent samples - confirm the test is appropriate
compList = list(c("DoubleWide-Left", "DoubleWide-Right"), c("DoubleWide", "DoubleWide-Right"), c("DoubleWide-Left", "DoubleWide"))

# count number of obs per box
d = df %>% 
    group_by(peakType) %>% 
    summarise(n = n()) %>%
    # spell out `decimal.mark` instead of relying on partial argument matching
    mutate(nNice = paste0("N=", format(n, big.mark = ".", decimal.mark = ","))) 

# Boxplot (outliers hidden) of the fold-changes per peak class with P values
# of the pairwise comparisons and the number of observations per box
p1 = ggplot(df, aes(x = peakType, y = lfc, fill = peakType)) +
    geom_boxplot(outlier.colour = NA) +
    theme_pub() +
    scale_fill_npg() +
    theme(legend.position = "none") +
    geom_hline(yintercept = 0, linetype = "dashed") +
    labs(x = "", y = "Fold-change (log2)") +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
    coord_cartesian(ylim = c(-1,1)) +
    stat_compare_means(comparisons = compList, label = c("p.format"), method = "wilcox.test", label.y = c(0.45,0.6,0.3), tip.length = 0.01, size = 2) +
    geom_text(data = d, aes(x = peakType, y = -0.8, label = nNice), size = 2, angle = 90) 
p1

V1

Show code
# Grey-scale variant of the cleaned boxplot: same data (`df`), comparisons
# (`compList`) and per-box counts (`d`), only the fill scale differs
p2 = ggplot(df, aes(x = peakType, y = lfc, fill = peakType)) +
    geom_boxplot(outlier.colour = NA) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    stat_compare_means(
        comparisons = compList,
        label = c("p.format"),
        method = "wilcox.test",
        label.y = c(0.45,0.6,0.3),
        tip.length = 0.01,
        size = 2) +
    geom_text(
        data = d,
        aes(x = peakType, y = -0.8, label = nNice),
        size = 2,
        angle = 90) +
    scale_fill_grey() +
    coord_cartesian(ylim = c(-1,1)) +
    labs(x = "", y = "Fold-change (log2)") +
    theme_pub() +
    theme(
        legend.position = "none",
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

p2

V2

5 Session Information

Show code
# Print the R version and the attached/loaded package versions of this session
sessionInfo()
R version 4.2.1 (2022-06-23)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur ... 10.16

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] grid      stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] Gviz_1.41.1                 BindingSiteFinder_1.7.8    
 [3] ggpubr_0.6.0                RColorBrewer_1.1-3         
 [5] stringr_1.5.0               forcats_1.0.0              
 [7] tibble_3.2.1                patchwork_1.1.2            
 [9] ggforce_0.4.1               Cairo_1.6-0                
[11] ggnewscale_0.4.9            ggsci_3.0.0                
[13] ggridges_0.5.4              tidyr_1.3.0                
[15] ggpointdensity_0.1.0        ggbeeswarm_0.7.2           
[17] ggrastr_1.0.2               PRROC_1.3.1                
[19] plotROC_2.3.0               factoextra_1.0.7           
[21] GGally_2.1.2                circlize_0.4.15            
[23] ComplexHeatmap_2.14.0       gplots_3.1.3               
[25] DESeq2_1.37.6               SummarizedExperiment_1.27.3
[27] MatrixGenerics_1.9.1        matrixStats_1.0.0          
[29] BiocParallel_1.31.13        viridis_0.6.3              
[31] viridisLite_0.4.2           gridExtra_2.3              
[33] ggrepel_0.9.3               knitr_1.43                 
[35] kableExtra_1.3.4            GenomicFeatures_1.49.7     
[37] UpSetR_1.4.0                reshape2_1.4.4             
[39] dplyr_1.1.2                 AnnotationDbi_1.59.1       
[41] Biobase_2.57.1              ggplot2_3.4.2              
[43] rtracklayer_1.57.0          GenomicRanges_1.49.1       
[45] GenomeInfoDb_1.33.10        IRanges_2.31.2             
[47] S4Vectors_0.35.4            BiocGenerics_0.43.4        

loaded via a namespace (and not attached):
  [1] utf8_1.2.3               tidyselect_1.2.0         RSQLite_2.3.1           
  [4] htmlwidgets_1.6.2        munsell_0.5.0            codetools_0.2-19        
  [7] interp_1.1-4             withr_2.5.0              colorspace_2.1-0        
 [10] filelock_1.0.2           highr_0.10               rstudioapi_0.14         
 [13] ggsignif_0.6.4           labeling_0.4.2           GenomeInfoDbData_1.2.9  
 [16] polyclip_1.10-4          bit64_4.0.5              farver_2.1.1            
 [19] vctrs_0.6.3              generics_0.1.3           xfun_0.39               
 [22] biovizBase_1.45.0        BiocFileCache_2.5.2      R6_2.5.1                
 [25] doParallel_1.0.17        clue_0.3-64              locfit_1.5-9.8          
 [28] AnnotationFilter_1.21.0  bitops_1.0-7             cachem_1.0.8            
 [31] reshape_0.8.9            DelayedArray_0.23.2      BiocIO_1.7.1            
 [34] scales_1.2.1             nnet_7.3-19              beeswarm_0.4.0          
 [37] gtable_0.3.3             ensembldb_2.21.5         rlang_1.1.1             
 [40] genefilter_1.79.0        systemfonts_1.0.4        GlobalOptions_0.1.2     
 [43] splines_4.2.1            lazyeval_0.2.2           rstatix_0.7.2           
 [46] dichromat_2.0-0.1        broom_1.0.5              checkmate_2.2.0         
 [49] yaml_2.3.7               abind_1.4-5              backports_1.4.1         
 [52] Hmisc_5.1-0              tools_4.2.1              Rcpp_1.0.10             
 [55] plyr_1.8.8               base64enc_0.1-3          progress_1.2.2          
 [58] zlibbioc_1.43.0          purrr_1.0.1              RCurl_1.98-1.12         
 [61] prettyunits_1.1.1        deldir_1.0-9             rpart_4.1.19            
 [64] GetoptLong_1.0.5         cluster_2.1.4            magrittr_2.0.3          
 [67] data.table_1.14.8        ggdist_3.3.0             ProtGenerics_1.29.1     
 [70] hms_1.1.3                evaluate_0.21            xtable_1.8-4            
 [73] XML_3.99-0.14            jpeg_0.1-10              shape_1.4.6             
 [76] compiler_4.2.1           biomaRt_2.53.3           KernSmooth_2.23-21      
 [79] crayon_1.5.2             htmltools_0.5.5          Formula_1.2-5           
 [82] geneplotter_1.75.0       DBI_1.1.3                tweenr_2.0.2            
 [85] dbplyr_2.3.2             MASS_7.3-60              rappdirs_0.3.3          
 [88] Matrix_1.5-4.1           car_3.1-2                cli_3.6.1               
 [91] parallel_4.2.1           pkgconfig_2.0.3          GenomicAlignments_1.33.1
 [94] foreign_0.8-84           xml2_1.3.4               foreach_1.5.2           
 [97] svglite_2.1.1            annotate_1.75.0          vipor_0.4.5             
[100] webshot_0.5.4            XVector_0.37.1           rvest_1.0.3             
[103] VariantAnnotation_1.43.3 distributional_0.3.2     digest_0.6.31           
[106] Biostrings_2.65.6        rmarkdown_2.22           htmlTable_2.4.1         
[109] restfulr_0.0.15          curl_5.0.1               Rsamtools_2.13.4        
[112] gtools_3.9.4             rjson_0.2.21             lifecycle_1.0.3         
[115] jsonlite_1.8.5           carData_3.0-5            BSgenome_1.65.2         
[118] fansi_1.0.4              pillar_1.9.0             lattice_0.21-8          
[121] KEGGREST_1.37.3          fastmap_1.1.1            httr_1.4.6              
[124] survival_3.5-5           glue_1.6.2               png_0.1-8               
[127] iterators_1.0.14         bit_4.0.5                stringi_1.7.12          
[130] blob_1.2.4               latticeExtra_0.6-30      caTools_1.18.2          
[133] memoise_2.0.1